% =========== Leave One Patient Out (LOPO) Cross Validation ==============
% Data-Driven Identification of Prognostic Tumor Subpopulations Using 
% Spatially Mapped t-SNE of Mass Spectrometry Imaging Data
% ============================ About LOPO =================================
% This file may take a long time to finish the LOPO analysis for all patients.
% A patient is excluded and the entire pipeline is applied; that patient is
% then used for clinical outcome prediction (i.e. to be assigned to a
% certain survival group). Note that this is a double-loop cross validation
% in which the left-out patient is unseen during feature extraction (i.e.
% the steps of tSNE, discretization and SAM analysis) and during
% classifier training. LOPO prevents any information leakage during the
% cross validation, and thus the final result is not biased.
% Note: Some parts of this pipeline use an R implementation. Please install R and a
% proper toolbox to connect R and Matlab.
% Note: Please run this file (LOPO.0 to LOPO.6) cell-by-cell (Ctrl+Enter).
% Also note that step LOPO.5 needs interactive effort to run
% properly, as you have to run the R implementation in the file
% "SAM_GastricCancer_R.txt" to get the significant m/z features.
% (Look at SAM_GastricCancer_R.txt first; if needed, change the value of the parameter "delta".)
% ========================================================================
% Written By Walid M. Abdelmoula, LUMC, NL

% Workspace setup for running the tSNE with each patient left out in turn:
% reset the session, load the dataset file and register the helper folders.
clear all; clc;
load('../gastric_cancer_dataset.mat');

% Folders are added one at a time, in this order, so the resulting search
% path precedence is the same as with three consecutive addpath calls.
Helper_Folders = {'../../Matlab_Files/Functions/', ...
                  '../../Matlab_Files/Functions/kaplanmeier_plot/', ...
                  '../../Matlab_Files/RviaMatlab/'};
for Folder_k = 1:numel(Helper_Folders)
    addpath(Helper_Folders{Folder_k});
end
clear Helper_Folders Folder_k

%% LOPO.0 Load Data:
% Unfold the MSI data cube into a 2-D matrix with one row per pixel
% (S(1)*S(2) rows) and one column per entry of the third dimension, then
% keep only the rows whose goodlist entry is positive.
S = size(MSI_data_cube);
N_Masses = S(3);
N_Patients = length(Clinical_data);
Folded_Measurements = reshape(MSI_data_cube, S(1)*S(2), N_Masses);
% indx holds the linear pixel positions of the retained rows; it is used
% later to map results back onto the image grid.
indx = find(goodlist>0);
Tumors_All = Folded_Measurements(indx,:);

%% LOPO.1 Dimensionality Reduction: Leave one patient out and build a new tSNE
% For every patient, drop that patient's pixels from Tumors_All, run
% fast_tsne_seed twice (the second run receives the first 3-D embedding as
% its last argument) and save the second embedding to
% 'tSNE_Maps_ Leave_One_Patient\LeavePatient_<PatientID>.mat'.
%
% NOTE(review): pixels are excluded using the loop index i, while the file
% is named after PatientID(i); cell LOPO1.1 uses a single value for both
% purposes. The two conventions agree only if PatientID(i) == i -- please
% confirm this holds for the dataset.

% Create the output folder before the time-consuming t-SNE runs start, so
% that save() cannot fail on a missing directory after the work is done.
% With two outputs mkdir stays silent when the folder already exists.
[Dir_OK, Dir_Msg] = mkdir('tSNE_Maps_ Leave_One_Patient');
if ~Dir_OK
    error('LOPO:outputFolder', 'Cannot create the folder for the tSNE maps: %s', Dir_Msg);
end

% Per-row patient label of Tumors_All; loop-invariant, so computed once.
Pixel_Patient_Labels = pixel_to_sample_ID(goodlist>0);

for i =1:N_Patients
    SelectedPatient = i;
    ii = find(Pixel_Patient_Labels~=SelectedPatient); %exclude this patient
    Tumors_PatientExcluded = Tumors_All(ii,:);
    % First run builds an initial 3-D embedding, second run is seeded with it.
    mappedX_3D_PatientExc = fast_tsne_seed(Tumors_PatientExcluded,3);
    mappedX2_3D_PatientExc = fast_tsne_seed(Tumors_PatientExcluded, 3, [], [], [], [], mappedX_3D_PatientExc);
    File_Name = strcat('tSNE_Maps_ Leave_One_Patient\LeavePatient_',num2str(PatientID(i)),'.mat');
    save(File_Name,'mappedX2_3D_PatientExc');
    % Release both embeddings before the next iteration.
    clear mappedX2_3D_PatientExc mappedX_3D_PatientExc
end
%% LOPO1.1 Load tSNE maps reconstructed from excluding one patient
% Loads the saved leave-one-out embedding for SelectedPatient, rebuilds the
% list of pixel positions that belong to the remaining patients and
% renders the embedding as a 2-D image.
SelectedPatient = 28; %ID of the excluded patient
File_Name = strcat('tSNE_Maps_ Leave_One_Patient\LeavePatient_',num2str(SelectedPatient),'.mat');
% Load only the embedding so that nothing else stored in the file can
% overwrite variables already in the workspace.
load(File_Name, 'mappedX2_3D_PatientExc');

% Row numbers (within the goodlist>0 pixels) that do not belong to the
% excluded patient, and their linear positions in the image grid.
pixel_to_sample_ID_New = find(pixel_to_sample_ID(goodlist>0)~=SelectedPatient);
indx_New = indx(pixel_to_sample_ID_New);

% The embedding must have exactly one row per remaining pixel; a mismatch
% means the file was built with a different exclusion than the one above.
if size(mappedX2_3D_PatientExc,1) ~= numel(indx_New)
    error('LOPO:embeddingSizeMismatch', ...
        'Embedding in %s has %d rows but %d pixels remain after excluding patient %d.', ...
        File_Name, size(mappedX2_3D_PatientExc,1), numel(indx_New), SelectedPatient);
end

labc_PatientOut = embedding2LABcolormap(mappedX2_3D_PatientExc);
% figure,scatter3(mappedX2_3D_PatientExc(:,1),mappedX2_3D_PatientExc(:,2),mappedX2_3D_PatientExc(:,3),3,labc_PatientOut); grid off
tSNE_SegmentationMap_OnePatientOut = Visualize_tSNE_2DImage(mappedX2_3D_PatientExc,S(1),S(2),indx_New);
close;
%% LOPO.2 ===== Find the optimal number of clusters from the tSNE Space using Bisecting Kmeans
% Optimal_NumberClusters (project helper) receives the leave-one-out
% embedding, the data cube size S and the linear pixel positions of the
% embedding rows.
[LOU_K_Clusters, LOU_Corr_Values, LOU_IDX, LOU_C] = Optimal_NumberClusters(mappedX2_3D_PatientExc,S,indx_New);
% Rank the returned correlation values from highest to lowest;
% LOUT_Ranked_Index(1) is the position of the largest value.
% NOTE(review): step LOPO.3 uses this position directly as the number of
% clusters and never consults LOU_K_Clusters. That is only correct if
% position p in LOU_Corr_Values corresponds to K = p -- confirm against
% Optimal_NumberClusters.
[Ranked_Correlation, LOUT_Ranked_Index] = sort(LOU_Corr_Values,'descend');
%% LOPO.3 Link to clinical data: assign pixels to components, and associate
%  a component with a sample when it covers a minimum fraction
%  (> n/k * 100% of the pixels) of that sample
%  (the thresholding itself is performed inside assign_regions).
LOUT_Rank_Order = 1; %1st ranked peak
% No semicolon on purpose: the chosen number of clusters is displayed.
LOUT_K_ranked = LOUT_Ranked_Index(LOUT_Rank_Order) %K-clusters corresponding to the 1st ranked peak
opt=[1E-6 1 1]; % options vector handed to MyKmeans (meaning defined by that helper)
[LOUT_rIDX,LOUT_rC,rCovMat,rDmat,rCo]=MyKmeans(mappedX2_3D_PatientExc,LOUT_K_ranked,opt);
[LOUT_Kmeans_SegmentationMap, LOUT_IDXs, Cs, Color_Scheme] = Visualize_combined_ClusteredImage(LOUT_rIDX,LOUT_rC,S,indx_New,LOUT_K_ranked);
%-------------------------------------------
% Patient label of every retained pixel. The mask is goodlist > 0, the same
% mask used to build Tumors_All and indx, so the rows stay aligned even if
% goodlist contains positive values other than 1.
Patient_Tissues = pixel_to_sample_ID(goodlist > 0);
% From here on pixel_to_sample_ID_New holds patient labels (not row
% numbers as in LOPO1.1) for the pixels of the remaining patients.
pixel_to_sample_ID_New = Patient_Tissues(Patient_Tissues~=SelectedPatient); %Exclude this selected patient (PatientID)
[LOUT_sample_to_component, LOUT_pixel_to_component, LOUT_IDXs_Thresholded] = assign_regions(LOUT_IDXs,LOUT_K_ranked, pixel_to_sample_ID_New);
% Number of distinct non-zero component labels. Counting the non-zero
% unique values is correct whether or not the label 0 actually occurs.
LOUT_nr_comps = nnz(unique(LOUT_pixel_to_component));
% ====== color tSNE based on the K-clusters ======
[LOUT_RGB_COLORS_OfClusters,colstr] = RGB_Color_Ncomponents(LOUT_nr_comps);
plots_dir = pwd; % presumably read by LOUT_RunSurvivalAnalysis -- TODO confirm
figure, scatter3(mappedX2_3D_PatientExc(:,1),mappedX2_3D_PatientExc(:,2),mappedX2_3D_PatientExc(:,3),3,LOUT_IDXs);
% NOTE(review): colormap expects values in [0,1]; the commented LOPO.7 code
% divides a similar colour table by 255 -- confirm the range returned by
% RGB_Color_Ncomponents.
colormap(LOUT_RGB_COLORS_OfClusters);grid off
% Kaplan-Meier: Survival Analysis (external script, runs in this workspace)
LOUT_RunSurvivalAnalysis 
%% LOPO.4 Investigate survivals between two groups: Statistical significance
% Before running this cell, inspect the survival plot (LOUT_KM_plot_k_.png)
% and set the two IDs below to the clusters that show the good and the
% poor survival, respectively.
ID_lowSurv = 1;
ID_goodSurv = 3;

close;
% Name of the data set file, built from the number of components.
LOU_DataSetName = ['LOU_jdata_', num2str(LOUT_nr_comps), '.txt'];
[LOU_ColorMap, colstr] = MyColorMap(LOUT_nr_comps);
[LOU_goodSurv, LOU_badSurv] = InvestigateSurvTwoGroups(ID_goodSurv, ID_lowSurv, ...
    LOUT_IDXs_Thresholded, S, indx_New, LOU_DataSetName, LOU_ColorMap);
%% LOPO.5 SAM analysis on leave-one-out
% Prepares the workspace variables and then runs the external script
% SAM_LeaveOneOut, which executes in this workspace.
clear MZ_Average_ConcatenatedSubpop

% Labeling mode: three survival classes (i.e. Poor, Medium, High Survivals)
Triple_Labeling = 1;
MultiClass_MultiLabeling = 0;

% Cluster IDs of the poor and the high survival subpopulations; look at
% the survival curves resulting from step 4 to set them.
Low_Surv = 1;
High_Surv = 3;

% Tumor subpopulations whose prognostic signature is to be retrieved,
% together with the thresholded labels and their pixel positions.
SelectedSubpop = unique(LOUT_IDXs);
IDXs_Values = LOUT_IDXs_Thresholded;
indxoo = indx_New;

SAM_LeaveOneOut
% Please make sure to save the significant m/z values resulting from SAM,
% repeated for each excluded patient (e.g. in a file called SAM_LOPO.xlsx).
% The author's earlier results of this analysis are provided in the
% attached file "SAM_LOPO.mat".
close all
%% LOPO.6 ======== LOUT: Classification ===========
% The result of the SAM method of the previous step is a feature vector of
% significant m/z values that achieved FDR<=0.001. Look at that result and
% copy it to the variable Mzs below:

Mzs = [4967,3445,3374,5003,5173,4940,3409,4748,4912,3711,4788,10843,5145,3670,13166]; 

% Build the labels for a KNN classifier with three classes
% (1: low, 2: medium, 3: high survival), then run the external script
% Classification_LOUT, which executes in this workspace.
clear NewLabels_IDXs
NewLabels_IDXs = LOUT_IDXs;
Low_Surv = 1; % look at the survival curves resulting from step 4 to set the ID of the Poor and High Survival subpop
High_Surv = 3;

% Group membership is taken from the ORIGINAL cluster IDs as logical masks
% before any relabeling. Masks work for row and column label vectors
% alike, unlike vertically concatenating the index lists.
Is_Low = ismember(NewLabels_IDXs, Low_Surv);
Is_High = ismember(NewLabels_IDXs, High_Surv);
Is_Medium = ~(Is_Low | Is_High);

% Index lists are still provided for code that runs after this cell.
Low_i = find(Is_Low);
High_i = find(Is_High);
All_Indices = (1:numel(NewLabels_IDXs))';
Medium_i = find(Is_Medium);
Medium_i = Medium_i(:); % column vector, as setdiff on All_Indices returned

NewLabels_IDXs(Is_Low) = 1;    % Low Survivals labeled as 1
NewLabels_IDXs(Is_Medium) = 2; % Medium Survivals labeled as 2
NewLabels_IDXs(Is_High) = 3;   % High Survivals labeled as 3
Classification_LOUT 

%% LOPO.7: As a validation step, build a new tSNE map using a shortened
% % % % % feature vector (i.e. m/z features that appear in >= 80% of LOPO runs).
% % % % % Color this new tSNE map using the original cluster labels (IDXs) you get
% % % % % from running the full dataset experiment from "Run_Gastric.m"
% % % % 
% % % % %  Load the file "SAM_LOPO.mat" you got from step LOPO.5 and then choose
% % % % %  those m/z features that appear in at least 80% of LOPO runs. You will
% % % % %  find it exactly as follows:
% % % % load('SAM_LOPO.mat');
% % % % MZ_Values_S = sort(MZ_Values);
% % % % A_UniqueSAMFeatures = unique(MZ_Values_S)' ;
% % % % A_CountSAMFeatures = histc(MZ_Values, A_UniqueSAMFeatures)';
% % % % % ****** Find those features that occurred in at least 80% of LOPO runs *********
% % % % N_Patients = size(sample_data,2);
% % % % Minimum_Desired_Counts = 0.8*N_Patients; % frequency threshold for m/z features that appeared in at least 80% of LOPO runs
% % % % ind = find(A_CountSAMFeatures >= Minimum_Desired_Counts);
% % % % Top_80Percent_SAMFEatures = A_UniqueSAMFeatures(ind)
% % % % Mzs = Top_80Percent_SAMFEatures;
% % % % 
% % % % % Run a 2D tSNE using only Mzs
% % % % for i=1:length(Mzs)
% % % % ij = find (master_peak_list2(:,1) ==Mzs(i));
% % % % DataCubeProg(:,i) = Tumors_All(:,ij);
% % % % end
% % % % tSNEBPrognostic1_2D = fast_tsne_seed(DataCubeProg, 2);
% % % % tSNEBPrognostic2_2D = fast_tsne_seed(DataCubeProg, 2, [], [], [], [], tSNEBPrognostic1_2D);
% % % % save(strcat(FigureFolder,'2D_tSNE.mat'),'tSNEBPrognostic2_2D');
% % % % 
% % % % %Color the tSNE map using the proteomic expression for each of Mzs
% % % % for i=1:length(Mzs)
% % % % ij = find (master_peak_list2(:,1) ==Mzs(i));
% % % % mz_eq = imadjust(Tumors_All(:,ij)); 
% % % % hf = figure; scatter(tSNEBPrognostic2_2D(:,1),tSNEBPrognostic2_2D(:,2),3,Tumors_All(:,ij));
% % % % colormap(hot);
% % % % end
% % % % 
% % % % %Color the tSNE map using original cluster labels derived from running the
% % % % %original experiment from the file "Run_Gastric.m". From this file get the
% % % % % values of IDXs and RGB_COLORS_OfClusters (Look at lines 44 and 56 in Run_Gastric.m)
% % % % hh= figure;
% % % % scatter(tSNEBPrognostic2_2D(:,1),tSNEBPrognostic2_2D(:,2),3,IDXs); colormap(RGB_COLORS_OfClusters./255);
